library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mice::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()   masks stats::lag()
library(readr)
# For NLP
library(tidyverse) # metapackage with lots of helpful functions
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(tidytext)
library(RColorBrewer)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
#install.packages("wordcloud")
library(wordcloud)
#install.packages("igraph")
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
#install.packages("widyr")
library(widyr)
#install.packages("ggraph")
library(ggraph)
#install.packages("ngram")
#install.packages("wordcloud2")
library(ngram)
library(wordcloud2)
# Point the session at the project folder; relative file names used further
# down in this script are resolved against this directory.
setwd("C:/VITc/5th Sem/CSE3505, Foundtns. Of Data Analytics/FDA_Proj")

# Load the song-lyrics table (columns: artist, song, link, text).
data <- read.csv("C:/VITc/5th Sem/CSE3505, Foundtns. Of Data Analytics/FDA_Proj/Final/aa_songdata.csv")

PreProcessing

head(data,n=2)
##   artist                  song                                       link
## 1   ABBA Ahe's My Kind Of Girl /a/abba/ahes+my+kind+of+girl_20598417.html
## 2   ABBA      Andante, Andante      /a/abba/andante+andante_20002708.html
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the way that she smiles when she sees me  \nHow lucky can one fellow be?  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?  \n  \nAnd when we go for a walk in the park  \nAnd she holds me and squeezes my hand  \nWe'll go on walking for hours and talking  \nAbout all the things that we plan  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?\n\n
## 2 Take it easy with me, please  \nTouch me gently like a summer evening breeze  \nTake your time, make it slow  \nAndante, Andante  \nJust let the feeling grow  \n  \nMake your fingers soft and light  \nLet your body be the velvet of the night  \nTouch my soul, you know how  \nAndante, Andante  \nGo slowly with me now  \n  \nI'm your music  \n(I am your music and I am your song)  \nI'm your song  \n(I am your music and I am your song)  \nPlay me time and time again and make me strong  \n(Play me again 'cause you're making me strong)  \nMake me sing, make me sound  \n(You make me sing and you make me)  \nAndante, Andante  \nTread lightly on my ground  \nAndante, Andante  \nOh please don't let me down  \n  \nThere's a shimmer in your eyes  \nLike the feeling of a thousand butterflies  \nPlease don't talk, go on, play  \nAndante, Andante  \nAnd let me float away  \n  \nI'm your music  \n(I am your music and I am your song)  \nI'm your song  \n(I am your music and I am your song)  \nPlay me time and time again and make me strong  \n(Play me again 'cause you're making me strong)  \nMake me sing, make me sound  \n(You make me sing and you make me)  \nAndante, Andante  \nTread lightly on my ground  \nAndante, Andante  \nOh please don't let me down  \n  \nMake me sing, make me sound  \n(You make me sing and you make me)  \nAndante, Andante  \nTread lightly on my ground  \nAndante, Andante  \nOh please don't let me down  \nAndante, Andante  \nOh please don't let me down\n\n
## Viewing summaries of the datasets
summary(data)
##     artist              song               link               text          
##  Length:57650       Length:57650       Length:57650       Length:57650      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character

Checking Null Values

#is.na(data)
sum(is.na(data))
## [1] 0
head(data,n=1)
##   artist                  song                                       link
## 1   ABBA Ahe's My Kind Of Girl /a/abba/ahes+my+kind+of+girl_20598417.html
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            text
## 1 Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the way that she smiles when she sees me  \nHow lucky can one fellow be?  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?  \n  \nAnd when we go for a walk in the park  \nAnd she holds me and squeezes my hand  \nWe'll go on walking for hours and talking  \nAbout all the things that we plan  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?\n\n

Singers freq.

library(wordcloud)
wordcloud2(song_freq[1:600,],size = .5)
library(tidyr)
library(tidytext)
# Tokenise the lyrics: one row per word of each song, with the other columns
# (artist, song, link) repeated on every row.
# NOTE(review): `song` is not defined in the visible part of this script;
# presumably it is the lyrics table loaded earlier -- confirm.
tidy_lyrics <- song %>% unnest_tokens(word, text)
head(tidy_lyrics,n=3)
##   artist                  song
## 1   ABBA Ahe's My Kind Of Girl
## 2   ABBA Ahe's My Kind Of Girl
## 3   ABBA Ahe's My Kind Of Girl
##                                                                    link word
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html look
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html   at
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html  her
song_wrd_count<-tidy_lyrics %>%count(song)
head(song_wrd_count,n=3)
##                          song   n
## 1                     - Human 114
## 2   (Ain't That) Just Like Me 262
## 3 (all I Can Do Is) Dream You 117

Visualizing more features.

# Counting total no. of words
lyric_counts <- tidy_lyrics%>%
  left_join(song_wrd_count, by =
              "song")%>%rename(total_words=n)
tail(lyric_counts,n=1)
##          artist      song
## 12700137   Zwan Heartsong
##                                                                link word
## 12700137 https://www.lyricsfreak.com/z/zwan/heartsong_20148991.html less
##          total_words
## 12700137         386
song_wrd_count %>%
  arrange(desc(n))%>%top_n(n=10)%>%
  ggplot(aes(x=factor(song,levels=song),y=n))+
  geom_col(col="yellow",fill="blue",size=1)+
  labs(x="song",y="word count",
       title="Words per song-Top 10")
## Selecting by n

song_wrd_count %>%
  arrange(desc(n))%>%tail(n=10)%>%
  ggplot(aes(x=factor(song,levels=song),y=n))+
  geom_col(col="green",fill="blue",size=1)+
  labs(x="song",y="word count",title="Songs, which have very less words")+
  theme(axis.text.x = element_text(angle=90))

# Bar chart of the last ten songs after sorting by word count in descending
# order, i.e. the songs with the fewest words.
song_wrd_count %>%
  arrange(desc(n)) %>%
  tail(n = 10) %>%
  ggplot(aes(x = factor(song, levels = song), y = n)) +
  geom_col(col = "yellow", fill = "darkorange", size = 1) +
  labs(x = "song", y = "word count", title = "Which song has very less words") +
  theme(axis.text.x = element_text(angle = 90))

#install.packages("textdata")
library(textdata)

#textdata::lexicon_afinn(manual_download = TRUE) 
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
lyric_counts <- tidy_lyrics%>%
  left_join(song_wrd_count, by ="song")%>%
  rename(total_words=n)
# library(tidytext)
# get_sentiments("nrc")
# lyric_sentiment<-try %>%
#   inner_join(get_sentiments("nrc"),by="word")
#remotes::install_github("EmilHvitfeldt/textdata")
library(remotes)
#install_github("EmilHvitfeldt/textdata")
#install_github("juliasilge/tidytext")
lyric_sentiment<-tidy_lyrics %>% inner_join(get_sentiments("nrc"),by="word")
head(lyric_sentiment)
##   artist                  song
## 1   ABBA Ahe's My Kind Of Girl
## 2   ABBA Ahe's My Kind Of Girl
## 3   ABBA Ahe's My Kind Of Girl
## 4   ABBA Ahe's My Kind Of Girl
## 5   ABBA Ahe's My Kind Of Girl
## 6   ABBA Ahe's My Kind Of Girl
##                                                                    link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
##        word sentiment
## 1 wonderful       joy
## 2 wonderful  positive
## 3 wonderful  surprise
## 4 wonderful     trust
## 5   special       joy
## 6   special  positive


```r
lyric_sentiment %>%filter(!sentiment %in% c("positive","negative"))%>%count(word,sentiment,sort=TRUE)%>%group_by(sentiment)%>%top_n(n=10)%>%ungroup() %>%
ggplot(aes(x=reorder(word,n),y=n,fill=sentiment))+geom_col(show.legend = FALSE)+facet_wrap(~sentiment,scales="free")+coord_flip()
## Selecting by n

lyric_sentiment %>%count(song,sentiment,sort=TRUE)%>%group_by(sentiment)%>%top_n(n=5)%>%ggplot(aes(x=reorder(song,n),y=n,fill=sentiment))+geom_bar(stat="identity",show.legend = FALSE)+facet_wrap(~sentiment,scales="free")+coord_flip()
## Selecting by n

# For joy, sadness and anger: the five artists with the most matching words,
# drawn as one horizontal bar panel per sentiment.
lyric_sentiment %>%
  count(artist, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  filter(sentiment %in% c("joy", "sadness", "anger")) %>%
  top_n(n = 5) %>%
  ggplot(aes(x = reorder(artist, n), y = n, fill = sentiment)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip()
## Selecting by n

nc<-get_sentiments("nrc")
unique(nc)
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,862 more rows
song_lex<-tidy_lyrics %>%inner_join(nc,by="word")
head(song_lex)
##   artist                  song
## 1   ABBA Ahe's My Kind Of Girl
## 2   ABBA Ahe's My Kind Of Girl
## 3   ABBA Ahe's My Kind Of Girl
## 4   ABBA Ahe's My Kind Of Girl
## 5   ABBA Ahe's My Kind Of Girl
## 6   ABBA Ahe's My Kind Of Girl
##                                                                    link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
##        word sentiment
## 1 wonderful       joy
## 2 wonderful  positive
## 3 wonderful  surprise
## 4 wonderful     trust
## 5   special       joy
## 6   special  positive
backu=song_lex
song_sent<-song_lex %>%count(song,sentiment)
tail(song_sent)
##               song sentiment  n
## 396040 Zor And Zam       joy  3
## 396041 Zor And Zam  negative 10
## 396042 Zor And Zam  positive  6
## 396043 Zor And Zam   sadness  2
## 396044 Zor And Zam  surprise  2
## 396045 Zor And Zam     trust  3
song_sent%>%filter(sentiment=="joy")%>%arrange(desc(n))%>%head(10)%>%ggplot(aes(x=reorder(song,n),y=n))+geom_col(fill="orange")+labs(title="Top Songs - Joy words",x="song",y="+ve Word Count")+coord_flip()

# Ten songs with the most words tagged "sadness", as a horizontal bar chart.
# The count axis is labelled as a sad-word count; the previous "+ve" label was
# carried over from the joy chart and mislabelled this plot.
song_sent %>%
  filter(sentiment == "sadness") %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(song, n), y = n)) +
  geom_col(fill = "red") +
  labs(title = "Top Songs - sad words", x = "song", y = "Sad Word Count") +
  coord_flip()

uncommon_wrd<-tidy_lyrics%>%count(song,word)%>%bind_tf_idf(word, song, n)%>%arrange(desc(tf_idf))
head(uncommon_wrd)
##                                       song        word   n        tf       idf
## 1                              Starfuckers starfuckers  23 0.6216216 10.017307
## 2 Chee-Chee Oo Chee (Sang The Little Bird)        chee 153 0.4608434 10.017307
## 3                             Boku Wa Kuma        kuma  38 0.3392857 10.710454
## 4                  Real Good Time Together          na 144 0.6824645  4.660721
## 5                          Kicker Of Elves         dee  70 0.5000000  6.210645
## 6                                  Kurushi     kurushi  20 0.2898551 10.710454
##     tf_idf
## 1 6.226975
## 2 4.616410
## 3 3.633904
## 4 3.180776
## 5 3.105322
## 6 3.104480
uncommon_wrd %>%arrange(desc(tf_idf))%>%head(20)%>%
ggplot(aes(x=word,y=tf_idf,fill=song))+geom_col()+labs(x="words",title="top 20- Associated words to songs in Lyrics")+theme(axis.text.x=element_text(angle=90))

tidy_lyrics %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 300)
## Joining, by = "word"

lyrics_bigram <- unnest_tokens(data, input = text, output = bigram, token = "ngrams", n=2)
head(lyrics_bigram)
##   artist                  song
## 1   ABBA Ahe's My Kind Of Girl
## 2   ABBA Ahe's My Kind Of Girl
## 3   ABBA Ahe's My Kind Of Girl
## 4   ABBA Ahe's My Kind Of Girl
## 5   ABBA Ahe's My Kind Of Girl
## 6   ABBA Ahe's My Kind Of Girl
##                                                                    link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
##        bigram
## 1     look at
## 2      at her
## 3    her face
## 4   face it's
## 5      it's a
## 6 a wonderful
# Split every bigram into its two words and keep only pairs in which neither
# word is a stop word.
# `sep` has to be an argument of separate(), not an element of the `into`
# vector: inside c() it requested a third output column and left separate()
# on its default non-alphanumeric separator, which also splits contractions
# such as "it's" (hence the "Expected 3 pieces" warnings).
bigram_filtered <- lyrics_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
## Warning: Expected 3 pieces. Additional pieces discarded in 6130 rows [10922,
## 10923, 12319, 19711, 19712, 26083, 26086, 29267, 29353, 37897, 37898, 38865,
## 38867, 38868, 40451, 40452, 43539, 43674, 43675, 52558, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11298857 rows [1,
## 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ...].
head(bigram_filtered)
##   artist                  song
## 1   ABBA Ahe's My Kind Of Girl
## 2   ABBA Ahe's My Kind Of Girl
## 3   ABBA      Andante, Andante
## 4   ABBA      Andante, Andante
## 5   ABBA      Andante, Andante
## 6   ABBA      Andante, Andante
##                                                                    link   word1
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html    feel
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html    feel
## 3      https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html  summer
## 4      https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html evening
## 5      https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html    slow
## 6      https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html andante
##     word2     
## 1    fine <NA>
## 2    fine <NA>
## 3 evening <NA>
## 4  breeze <NA>
## 5 andante <NA>
## 6 andante <NA>
# Ram issue.  https://www.kaggle.com/code/srisudheera/nlp-song-data-set/notebook
# bigram_united <- bigram_filtered %>%unite(bigram, word1, word2, sep = " ")
# head(bigram_united)
# bigram_counts <- bigram_united %>% count(bigram, sort = TRUE)
# head(bigram_counts)
# bigram_counts %>% arrange(desc(n))%>% head(20)%>%ggplot(aes(x=factor(bigram,levels=bigram),y=n))+geom_bar(stat="identity",fill="#FF3E45")+labs(title="Top 20 bigram words in Songs")+coord_flip()

Now, for Million Song Dataset.

library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(stringr)
library(tidyr)
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(ggplot2)
library(devtools)
## Loading required package: usethis
## 
## Attaching package: 'usethis'
## The following object is masked from 'package:remotes':
## 
##     git_credentials
## 
## Attaching package: 'devtools'
## The following objects are masked from 'package:remotes':
## 
##     dev_package_deps, install_bioc, install_bitbucket, install_cran,
##     install_deps, install_dev, install_git, install_github,
##     install_gitlab, install_local, install_svn, install_url,
##     install_version, update_packages
#devtools::install_github("nicolewhite/RNeo4j")
#install.packages("RNeo4j")
#library(RNeo4j)
library(recommenderlab)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## 
## Attaching package: 'recommenderlab'
## The following objects are masked from 'package:igraph':
## 
##     normalize, similarity
#install.packages("psych")
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(rstudioapi)
library(knitr)
library(kableExtra)
# install.packages("C:/Users/SSD/Downloads/RNeo4j-1.6.1.tar.gz", repos=NULL, type="source")
# library(RNeo4j)

The Data analytics approach

# Reading in the ratings dataframe and rename the columns
# NOTE: the download link below may no longer work; the file is read from
# the working directory instead.
#u1 <- "https://static.turi.com/datasets/millionsong/10000.txt"
# read.table() already returns a data frame, so no extra conversion is needed.
df1 <- read.table("10000.txt", header = FALSE, stringsAsFactors = FALSE)
# Adding the column names
names(df1) <- c("user_id", "song_id", "listen_count")
# Read in the metadata dataframe
#u2 <- "https://static.turi.com/datasets/millionsong/song_data.csv"
# NOTE(review): read.csv() warns "EOF within quoted string" for this file,
# which usually means rows after an unbalanced quote were swallowed --
# confirm the number of rows read against the source file.
metadata <- read.csv("MSD_song_data.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
head(metadata)
##              song_id
## 1 SOQMMHC12AB0180CB8
## 2 SOVFVAK12A8C1350D9
## 3 SOGTUKN12AB017F4F1
## 4 SOBNYVR12A8C13558C
## 5 SOHSBXH12A8C13B0DF
## 6 SOZVAPQ12A8C13B63C
##                                                            title
## 1                                                   Silent Night
## 2                                                    Tanssi vaan
## 3                                              No One Could Ever
## 4                                                  Si Vos Querés
## 5                                               Tangle Of Aspens
## 6 Symphony No. 1 G minor "Sinfonie Serieuse"/Allegro con energia
##                                release      artist_name year
## 1                Monster Ballads X-Mas Faster Pussy cat 2003
## 2                          Karkuteillä Karkkiautomaatti 1995
## 3                               Butter   Hudson Mohawke 2006
## 4                              De Culo      Yerba Brava 2003
## 5 Rene Ablaze Presents Winter Sessions       Der Mystic    0
## 6     Berwald: Symphonies Nos. 1/2/3/4 David Montgomery    0
# Joining the two datasets
# Join data by song ID. Remove duplicate song ratings.
joined <- distinct(inner_join(df1, metadata, by = "song_id"))


# Group and summarize joined dataframe by user ID
grouped_id <- joined %>%
  select(user_id, listen_count) %>%
  group_by(user_id) %>%
  summarise(number_songs = n(), 
            mean_listen_count = mean(listen_count), 
            sum_listen_count = sum(listen_count))

grouped_song <- joined %>% 
  select(song_id, title, artist_name) %>% 
  group_by(title)
describe(grouped_id)
##                   vars     n     mean       sd median  trimmed      mad min
## user_id*             1 68877 34439.00 19883.22  34439 34439.00 25528.89   1
## number_songs         2 68877     6.07     7.09      4     4.66     2.97   1
## mean_listen_count    3 68877     3.27     5.42      2     2.33     1.48   1
## sum_listen_count     4 68877    18.74    29.57      9    12.57    10.38   1
##                     max range  skew kurtosis    se
## user_id*          68877 68876  0.00    -1.20 75.76
## number_songs        144   143  3.95    28.35  0.03
## mean_listen_count   401   400 18.87   873.51  0.02
## sum_listen_count    951   950  5.54    65.81  0.11
msd=grouped_id
# High-level statistics on listeners
describe(grouped_id) %>% kable()
vars n mean sd median trimmed mad min max range skew kurtosis se
user_id* 1 68877 34439.000000 19883.221582 34439 34439.000000 25528.8894 1 68877 68876 0.000000 -1.200052 75.7616878
number_songs 2 68877 6.070328 7.086518 4 4.658875 2.9652 1 144 143 3.951246 28.350354 0.0270020
mean_listen_count 3 68877 3.270669 5.416100 2 2.330281 1.4826 1 401 400 18.870007 873.506563 0.0206371
sum_listen_count 4 68877 18.739652 29.571997 9 12.565142 10.3782 1 951 950 5.538541 65.806135 0.1126791

Checking null values.

# Count missing cells. is.null() only tests whether the object itself is NULL
# (always FALSE for a data frame), so it can never detect missing values;
# is.na() checks every cell.
sum(is.na(msd))
## [1] 0
md.pattern(msd)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       user_id number_songs mean_listen_count sum_listen_count  
## 68877       1            1                 1                1 0
##             0            0                 0                0 0
# Compare total songs and listeners
ggplot(data = grouped_id, aes(number_songs)) + 
  geom_histogram(binwidth = 1) +
  labs(title = "How people listen: songs vs. listeners", x = "Unique songs", y = "Total listeners")

The above histogram depicts the remarkable skew of the dataset.

# Comparing total songs and listeners below 100 songs
ggplot(data = grouped_id, aes(number_songs)) + 
  geom_histogram(breaks = seq(1, 100, by = 1)) +
  labs(title = "How people listen: songs vs. listeners", subtitle = "<100 songs (detail)", x = "Unique songs", y = "Total listeners")

# Compare total songs and total listens
ggplot(data = grouped_id, aes(x = number_songs, y = sum_listen_count)) +
  geom_point() +
  geom_smooth(method = "loess", se = F) +
  xlim(c(0, 8000)) +
  ylim(c(0, 8000)) +
  labs(title = "How people listen: songs vs. listens", x = "Unique songs", y = "Total listens")
## `geom_smooth()` using formula 'y ~ x'

# Number of unique songs.
length(unique(joined$song_id))
## [1] 1994
# Earliest recordings (correcting for null values coded as 0)
min(joined$year[which(joined$year > 0)])
## [1] 1958
# Total number of listens
sum(joined$listen_count)
## [1] 1290731
# High-level statistics on songs
describe(joined$listen_count)
##    vars      n mean   sd median trimmed mad min max range  skew kurtosis   se
## X1    1 418106 3.09 6.39      1    1.88   0   1 649   648 17.61   825.98 0.01
# Compare total listens and unique listeners
joined %>% 
  select(user_id, song_id, listen_count) %>% 
  group_by(song_id) %>% 
  summarise(total_listens = sum(listen_count), unique_listeners = n_distinct(user_id)) %>%
  ggplot(aes(x = total_listens, y = unique_listeners)) +
  geom_point() +
  geom_smooth(method = "loess", se = F) +
  xlim(c(0, 8500)) +
  ylim(c(0, 6000)) +
  labs(title = "How songs are listened to: unique songs vs. total listens", x = "Total listens", y = "Unique listeners")
## `geom_smooth()` using formula 'y ~ x'

Calculating the rating and applying filters.

# Attach each user's summary columns (number_songs, mean_listen_count,
# sum_listen_count) to every listen record of that user.
joined2 <- left_join(joined, grouped_id, by = "user_id")

# Implicit rating: the share of the user's total listens that went to this
# song, expressed as a percentage rounded to two decimals.
joined_final <- joined2 %>%
  mutate(rating = round((listen_count / sum_listen_count) * 100, 2))
# Drop rows whose rating is 100 (all of the user's listens are on one song),
# keep users with a mean listen count above 2 and at least 15 songs, and
# discard songs whose year is coded as 0.
joined_final <- joined_final %>%
  filter(rating < 100, mean_listen_count > 2, number_songs >= 15, year > 0)

head(joined_final)  %>% 
  kable("html")     %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
user_id song_id listen_count title release artist_name year number_songs mean_listen_count sum_listen_count rating
5a905f000fc1ff3df7ca807d57edb608863db05d SOAFTRR12AF72A8D4D 1 Harder Better Faster Stronger Discovery Daft Punk 2007 88 2.670454 235 0.43
5a905f000fc1ff3df7ca807d57edb608863db05d SOAJJDS12A8C13A3FB 1 I Got Mine Attack & Release The Black Keys 2008 88 2.670454 235 0.43
5a905f000fc1ff3df7ca807d57edb608863db05d SOAKDHD12A6310F1AE 1 Face To Face (Cosmo VItelli Remix) Daft Club Daft Punk 2003 88 2.670454 235 0.43
5a905f000fc1ff3df7ca807d57edb608863db05d SOAUBGU12A6701C57A 2 Swallowed In The Sea X & Y Coldplay 2005 88 2.670454 235 0.85
5a905f000fc1ff3df7ca807d57edb608863db05d SOBDMNP12AF72AB1E1 2 Indo Silver Club Homework Daft Punk 1996 88 2.670454 235 0.85
5a905f000fc1ff3df7ca807d57edb608863db05d SOCHPFL12AF72A3F64 2 Full Circle (Explicit) Full Circle Drowning Pool 2007 88 2.670454 235 0.85
hist(joined_final$rating)

head(joined_final,n=2)
##                                    user_id            song_id listen_count
## 1 5a905f000fc1ff3df7ca807d57edb608863db05d SOAFTRR12AF72A8D4D            1
## 2 5a905f000fc1ff3df7ca807d57edb608863db05d SOAJJDS12A8C13A3FB            1
##                           title          release    artist_name year
## 1 Harder Better Faster Stronger        Discovery      Daft Punk 2007
## 2                    I Got Mine Attack & Release The Black Keys 2008
##   number_songs mean_listen_count sum_listen_count rating
## 1           88          2.670455              235   0.43
## 2           88          2.670455              235   0.43
md.pattern(joined_final)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       user_id song_id listen_count title release artist_name year number_songs
## 59556       1       1            1     1       1           1    1            1
##             0       0            0     0       0           0    0            0
##       mean_listen_count sum_listen_count rating  
## 59556                 1                1      1 0
##                       0                0      0 0

Observing the outliers

boxp1<-ggplot(joined_final, aes(x =number_songs, y=sum_listen_count))
# Adding the geometric object box plot
boxp1+geom_boxplot()
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

boxplot(joined_final$listen_count)

boxplot(joined_final$rating)

temp=joined_final
# Inspect the outliers of one column and optionally replace them with NA.
#
# Draws a 2x2 panel (boxplot and histogram, with and without outliers),
# reports outlier counts and means via message(), then asks on the console
# whether the outliers should be replaced.
#
# Arguments:
#   dt   data frame holding the column.
#   var  unquoted name of the column to inspect.
#
# Returns (invisibly):
#   - answer "y"/"yes": the data frame with outliers in `var` set to NA; the
#     same data frame is also assigned, under the name the caller passed as
#     `dt`, in the global environment.
#   - any other answer: the column vector with outliers set to NA; `dt` is
#     left untouched.
#
# Outliers are the values flagged by boxplot.stats().
outlierKD <- function(dt, var) {
  # Evaluate the unquoted column name inside the data frame.
  var_name <- eval(substitute(var), eval(dt))
  tot <- sum(!is.na(var_name))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)

  # Switch to a 2x2 layout for the diagnostic panel and put the caller's
  # graphics settings back when the function exits, so later plots are not
  # silently squeezed into a quarter of the device.
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)

  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)

  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  # Newly introduced NAs are exactly the removed outliers.
  na2 <- sum(is.na(var_name))
  message("Outliers identified: ", na2 - na1, " from ", tot, " observations")
  message("Proportion (%) of outliers: ", (na2 - na1) / tot * 100)
  message("Mean of the outliers: ", mo)
  m2 <- mean(var_name, na.rm = TRUE)
  message("Mean without removing outliers: ", m1)
  message("Mean if we remove outliers: ", m2)

  response <- readline(prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: ")
  if (response %in% c("y", "yes")) {
    dt[as.character(substitute(var))] <- invisible(var_name)
    # Overwrite the caller's object of the same name in the global environment.
    assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
    message("Outliers successfully removed", "\n")
    return(invisible(dt))
  } else {
    message("Nothing changed", "\n")
    return(invisible(var_name))
  }
}

As we'll be using the KNN algorithm for our recommendations, these outliers are also necessary in this scenario: we cannot exclude people who have a different taste in music. So, in our preprocessing, we combined the datasets, examined their relations, etc.

outlierKD(temp, rating)
## Outliers identified: 5579 from 59556 observations
## Proportion (%) of outliers: 9.36765397273155
## Mean of the outliers: 17.5026008245205
## Mean without removing outliers: 4.11063469675599
## Mean if we remove outliers: 2.72645663893881
## Do you want to remove outliers and to replace with NA? [yes/no]:
## Nothing changed

outlierKD(temp, number_songs)
## Outliers identified: 3337 from 59556 observations
## Proportion (%) of outliers: 5.60312982738935
## Mean of the outliers: 81.0677255019479
## Mean without removing outliers: 29.1970078581503
## Mean if we remove outliers: 26.118109535922
## Do you want to remove outliers and to replace with NA? [yes/no]:
## Nothing changed

cor(temp$listen_count, temp$rating)
## [1] 0.7798603

The listen count and the song ratings are highly correlated.

# Numeric columns used for the correlation analysis.
s <- temp %>%
  dplyr::select(listen_count, year, number_songs, mean_listen_count,
                sum_listen_count, rating)
library(lattice)
library(reshape2)
# rounding to 2 decimal places
corr_mat <- round(cor(s), 2)
# Long format: one row per (Var1, Var2) pair with its correlation in `value`.
melted_corr_mat <- melt(corr_mat)
# plotting the correlation heatmap
library(ggplot2)
# The text layer inherits the tile's x/y mapping so every label sits on the
# cell it describes; it previously used the transposed mapping (Var2, Var1),
# which only looked right because a correlation matrix is symmetric.
ggplot(data = melted_corr_mat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), color = "black", size = 4)

# Load and install heatmaply package
#install.packages("heatmaply")
library(heatmaply)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## 
## ======================
## Welcome to heatmaply version 1.3.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
## 
## Attaching package: 'heatmaply'
## The following object is masked from 'package:recommenderlab':
## 
##     normalize
## The following object is masked from 'package:igraph':
## 
##     normalize
# Interactive correlation heatmap of the selected features in `s`
# (k_row / k_col are both set to 2 for the row and column dendrograms)
heatmaply_cor(
  x = cor(s),
  xlab = "Features",
  ylab = "Features",
  k_row = 2,
  k_col = 2
)
# Export `joined_final` to a CSV file (without row names) so it can be
# downloaded and reused in the following steps
write.csv(
  x = joined_final,
  file = "MSD_PrePrcsd_SSD.csv",
  row.names = FALSE
)
# Create an "App" subdirectory in the working directory to house the Shiny app
# (only if it is not there yet) and report which of the two cases applied
dir <- getwd()
dir.app <- file.path(dir, "App")
if (dir.exists(dir.app)) {
  print("Shiny app directory already exists")
} else {
  dir.create(dir.app)
  print(paste0("Shiny app directory created: ", dir.app))
}
## [1] "Shiny app directory already exists"
library(magrittr)
library(stringr)
library(tidyr)
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
library(ggplot2)
library(devtools)
#devtools::install_github("nicolewhite/RNeo4j")
#install.packages("RNeo4j")
#library(RNeo4j)
library(recommenderlab)
#install.packages("psych")
library(psych)
library(rstudioapi)
library(knitr)
library(kableExtra)